Hierarchical clustering using cutreeDynamic to automatically cut the tree

cluster<-genecluster(ratio_psedo,nct,method="hierarchical")

table(cluster)
## cluster
##   1   2   3   4   5   6   7   8   9  10  11  12  13  14  15  16  17  18 
## 412 319 283 245 201 190 185 183 130 123 116 115 112 107 100  85  80  66 
##  19  20  21  22 
##  45  35  35  25

Using mclust to do the clustering

cluster<-genecluster(ratio_psedo,nct,G=c(4,8,12,16,20,24))
## model-based optimal number of clusters: 16

table(cluster)
## cluster
##   1   2   3   4   5   6   7   8   9  10  11  12  13  14  15  16 
## 541 357 295 952 207 194 134  85  24  47 118  87  48  41  29  33

Modelling for each cluster

R=200
start_time <- Sys.time()
f <- ratio ~ p(X, pen = "gflasso")
cl <- makeCluster(ncores) # use 4 workers
registerDoParallel(cl) # register the parallel backend
start_time <- Sys.time()
ans <- pbsapply(1:max(cluster), function(i) { # try on cluster 9th and 10th first
  set.seed(i)
  print(paste("cluster",i))
  feat<-which(cluster==i)
  pheatmap(ratio_psedo[feat,], cluster_rows = FALSE, cluster_cols = FALSE,
         annotation_col = anno_df,show_colnames = F,show_rownames = F)
  # create dataframe for each cluster
  cl_ratio<-as.vector(unlist(ratio[feat,]))
  cl_total<-as.vector(unlist(total_QCed[feat,]))
  data<-tibble(ratio=cl_ratio,X=factor(rep(celltype,each=length(feat)),levels = cell_meta_unique),cts=cl_total)
  
  t <- system.time(fit<-fusedlasso(formula=f,model="binomial",data,ncores=ncores))[[3]] # saving the elapsed time
  t2 <- system.time(fit2<-fusedlasso(formula=f,model="gaussian",data,ncores=ncores))[[3]] # saving the elapsed time
  
  ## bootstrap 200 replicates
  t3<-system.time(boot <- boot(data,statistic=boot_fusedlasso,R=R,strata=data$X,formula=f,model="binomial",lambda1=fit$lambda,parallel="multicore",ncpus=ncores))[[3]]
  t4<-system.time(boot2 <- boot(data,statistic=boot_fusedlasso,R=R,strata=data$X,formula=f,model="gaussian",lambda2=fit2$lambda,parallel="multicore",ncpus=ncores))[[3]]
  ## boxplot of bootstrap allelic ratio estimator
  bootbox<-as_tibble(rbind(boot$t,boot2$t)) 
  colnames(bootbox)<-cell_meta_unique
  bootbox2<-tibble(bootbox,model=rep(c("bin","gau"),each=R))
  bootbox3<-bootbox2 %>% gather(key="type",value = "ratio",zy:lateblast)

  ggplot(data = bootbox3, mapping = aes(x=type, y=ratio, fill=model,col=model)) + 
  geom_boxplot()+ theme_classic()+
  geom_jitter(shape=16, position=position_jitter(0.2),size=0.7)+
  ylab("allelic ratio")+
  xlab("")+scale_x_discrete(name ="Cell types", 
                   limits=cell_meta_unique)
  ## consensus partation #################
  bootcl <-apply(bootbox, 1, function(x) fmatch(x,unique(x)))
  consens_par<-cl_consensus(cl_ensemble(list = apply(bootcl[,1:R],2,as.cl_hard_partition)),method = "SM")
  class <- max.col(consens_par$.Data)
  consens_par<-cl_consensus(cl_ensemble(list = apply(bootcl[,(R+1):(2*R)],2,as.cl_hard_partition)),method = "SM")
  class2 <- max.col(consens_par$.Data)
  
  ## confidence interval ####
  bootci<-apply(boot$t,2, function(x) quantile(x,probs = c(0.025,0.975))) #percentil bootstrap ci
  bootci2<-matrix(rep(2*boot$t0,each=2),nrow = 2)-bootci #type 2 percentile
  ratio_bc<-boot$t0 #estimator
  
  boot2ci<-apply(boot2$t,2, function(x) quantile(x,probs = c(0.025,0.975))) #percentil bootstrap ci
  boot2ci2<-matrix(rep(2*boot2$t0,each=2),nrow = 2)-boot2ci #type 2 percentile
  ratio_bc2<-boot2$t0 #estimator
  
  out<-list()
  out[["time"]]<-c(t,t2,t3,t4)
  out[["ratio"]]<-rbind(ratio_bc,ratio_bc2)
  out[["ci"]]<-rbind(bootci2,boot2ci2)
  out[["consensus"]]<-rbind(class,class2)
  return(out)
  }, cl=1)
## [1] "cluster 1"
## Warning: The `x` argument of `as_tibble.matrix()` must have column names if `.name_repair` is omitted as of tibble 2.0.0.
## Using compatibility `.name_repair`.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_warnings()` to see where this warning was generated.

## [1] "cluster 2"
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## [1] "cluster 3"
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## [1] "cluster 4"
## Warning: glm.fit: algorithm did not converge

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: algorithm did not converge
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## [1] "cluster 5"

## [1] "cluster 6"
## Warning: glm.fit: algorithm did not converge

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: algorithm did not converge
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## [1] "cluster 7"

## [1] "cluster 8"
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## [1] "cluster 9"

## [1] "cluster 10"

## [1] "cluster 11"
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## [1] "cluster 12"

## [1] "cluster 13"
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## [1] "cluster 14"

## [1] "cluster 15"

## [1] "cluster 16"
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

end_time <- Sys.time()
end_time - start_time
## Time difference of 1.256826 hours
stopCluster(cl)
save(ans,file = "/proj/milovelab/mu/SC-ASE/data/deng.rda")